# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.metrics import make_scorer
from sklearn import cross_validation
from sklearn.feature_extraction import DictVectorizer
from sklearn.cross_validation import train_test_split


# Metric used to measure prediction accuracy.
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE) as a fraction.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values (pandas Series, NumPy array or plain sequence).
        Every error is divided by the matching entry of ``y_true``, so it
        must not contain zeros.
    y_pred : array-like
        Predicted values, aligned with ``y_true`` by position.

    Returns
    -------
    float
        Sum of ``|(y_true - y_pred) / y_true|`` divided by the number of
        samples (the length of the first axis of ``y_true``).
    """
    # np.asarray accepts Series, ndarrays and sequences alike, whereas the
    # previous Series-only ``as_matrix`` call failed on anything else (and no
    # longer exists in current pandas). It also drops any pandas index, so the
    # subtraction below is purely positional. Forcing float avoids integer
    # (floor) division when both inputs are integer-typed on Python 2.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    n_samples = y_true.shape[0]
    mape = np.abs((y_true - y_pred) / y_true).sum() / float(n_samples)
    return mape

# Wrap the MAPE metric as a scikit-learn scorer. MAPE is an error measure
# (lower is better), hence greater_is_better=False; the resulting scorer
# reports the negated metric.
score_func = make_scorer(
    mean_absolute_percentage_error,
    greater_is_better=False,
)

# Load the training data from a semicolon-separated CSV file.
data_file = './price_train.csv'
data = pd.read_csv(data_file, sep=';')

# Fill missing values in each of these columns with that column's median.
for column in ('greening_rate', 'year'):
    data[column] = data[column].fillna(data[column].median())

# Separate the feature columns from the regression target.
predictors = ['district', 'loop_location', 'building_type', 'greening_rate', 'year', 'has_subway', 'is_hutong']
train = data[predictors]
labels = data['price']

# Turn every row into a {column: value} dict and let DictVectorizer build the
# feature matrix from them. orient='records' produces the per-row dicts
# directly; the previous transpose-then-to_dict route built a frame with one
# column per row (upcasting everything to object) and keyed rows by index
# label, so rows sharing a label would have been silently merged.
train = train.to_dict(orient='records')
vec = DictVectorizer()
train = vec.fit_transform(train)

# Hold out 25% of the samples as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    labels,
    test_size=0.25,
    random_state=42,
)

# Support vector regression model with an RBF kernel.
svr = SVR(kernel='rbf', C=650000, gamma=0.008, epsilon=0.0086)

# Estimate the error rate with 5-fold cross-validation on the training split,
# using all available cores.
scores = cross_validation.cross_val_score(
    svr,
    X_train,
    y_train,
    scoring=score_func,
    cv=5,
    n_jobs=-1,
)

# Report the average error. The scorer was created with
# greater_is_better=False, so only the magnitude of the mean is meaningful.
mean_score = np.abs(scores.mean())
print('cross validate score is %f' % mean_score)

# Train on the full training split, then predict the held-out test set
# (fit returns the estimator itself, so the calls can be chained).
y_pred = svr.fit(X_train, y_train).predict(X_test)

# Report the MAPE on the held-out data.
error = mean_absolute_percentage_error(y_test, y_pred)
print('predict error %f' % error)
